#Import all required libraries
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
import seaborn as sns
import pandas as pd
df= pd.read_csv("loan_data.csv")
df
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | debt_consolidation | 0.1189 | 829.10 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 | 0 |
| 1 | 1 | credit_card | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 | 0 |
| 2 | 1 | debt_consolidation | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 | 0 |
| 3 | 1 | debt_consolidation | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 | 0 |
| 4 | 1 | credit_card | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9573 | 0 | all_other | 0.1461 | 344.76 | 12.180755 | 10.39 | 672 | 10474.000000 | 215372 | 82.1 | 2 | 0 | 0 | 1 |
| 9574 | 0 | all_other | 0.1253 | 257.70 | 11.141862 | 0.21 | 722 | 4380.000000 | 184 | 1.1 | 5 | 0 | 0 | 1 |
| 9575 | 0 | debt_consolidation | 0.1071 | 97.81 | 10.596635 | 13.09 | 687 | 3450.041667 | 10036 | 82.9 | 8 | 0 | 0 | 1 |
| 9576 | 0 | home_improvement | 0.1600 | 351.58 | 10.819778 | 19.18 | 692 | 1800.000000 | 0 | 3.2 | 5 | 0 | 0 | 1 |
| 9577 | 0 | debt_consolidation | 0.1392 | 853.43 | 11.264464 | 16.28 | 732 | 4740.000000 | 37879 | 57.0 | 6 | 0 | 0 | 1 |
9578 rows × 14 columns
#Get the description of the data
df.describe()
| credit.policy | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9.578000e+03 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 |
| mean | 0.804970 | 0.122640 | 319.089413 | 10.932117 | 12.606679 | 710.846314 | 4560.767197 | 1.691396e+04 | 46.799236 | 1.577469 | 0.163708 | 0.062122 | 0.160054 |
| std | 0.396245 | 0.026847 | 207.071301 | 0.614813 | 6.883970 | 37.970537 | 2496.930377 | 3.375619e+04 | 29.014417 | 2.200245 | 0.546215 | 0.262126 | 0.366676 |
| min | 0.000000 | 0.060000 | 15.670000 | 7.547502 | 0.000000 | 612.000000 | 178.958333 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.103900 | 163.770000 | 10.558414 | 7.212500 | 682.000000 | 2820.000000 | 3.187000e+03 | 22.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 1.000000 | 0.122100 | 268.950000 | 10.928884 | 12.665000 | 707.000000 | 4139.958333 | 8.596000e+03 | 46.300000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 0.140700 | 432.762500 | 11.291293 | 17.950000 | 737.000000 | 5730.000000 | 1.824950e+04 | 70.900000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 1.000000 | 0.216400 | 940.140000 | 14.528354 | 29.960000 | 827.000000 | 17639.958330 | 1.207359e+06 | 119.000000 | 33.000000 | 13.000000 | 5.000000 | 1.000000 |
#Get information of dataset and also check any null value or not.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 purpose 9578 non-null object 2 int.rate 9578 non-null float64 3 installment 9578 non-null float64 4 log.annual.inc 9578 non-null float64 5 dti 9578 non-null float64 6 fico 9578 non-null int64 7 days.with.cr.line 9578 non-null float64 8 revol.bal 9578 non-null int64 9 revol.util 9578 non-null float64 10 inq.last.6mths 9578 non-null int64 11 delinq.2yrs 9578 non-null int64 12 pub.rec 9578 non-null int64 13 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
#Check data types of all feature present in dataset.
df.dtypes
credit.policy int64 purpose object int.rate float64 installment float64 log.annual.inc float64 dti float64 fico int64 days.with.cr.line float64 revol.bal int64 revol.util float64 inq.last.6mths int64 delinq.2yrs int64 pub.rec int64 not.fully.paid int64 dtype: object
#Get the shape of dataset (number of rows and columns)
df.shape
(9578, 14)
#To count the frequency of each feature
df.count
<bound method DataFrame.count of credit.policy purpose int.rate installment \
0 1 debt_consolidation 0.1189 829.10
1 1 credit_card 0.1071 228.22
2 1 debt_consolidation 0.1357 366.86
3 1 debt_consolidation 0.1008 162.34
4 1 credit_card 0.1426 102.92
... ... ... ... ...
9573 0 all_other 0.1461 344.76
9574 0 all_other 0.1253 257.70
9575 0 debt_consolidation 0.1071 97.81
9576 0 home_improvement 0.1600 351.58
9577 0 debt_consolidation 0.1392 853.43
log.annual.inc dti fico days.with.cr.line revol.bal revol.util \
0 11.350407 19.48 737 5639.958333 28854 52.1
1 11.082143 14.29 707 2760.000000 33623 76.7
2 10.373491 11.63 682 4710.000000 3511 25.6
3 11.350407 8.10 712 2699.958333 33667 73.2
4 11.299732 14.97 667 4066.000000 4740 39.5
... ... ... ... ... ... ...
9573 12.180755 10.39 672 10474.000000 215372 82.1
9574 11.141862 0.21 722 4380.000000 184 1.1
9575 10.596635 13.09 687 3450.041667 10036 82.9
9576 10.819778 19.18 692 1800.000000 0 3.2
9577 11.264464 16.28 732 4740.000000 37879 57.0
inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 0 0 0 0
1 0 0 0 0
2 1 0 0 0
3 1 0 0 0
4 0 1 0 0
... ... ... ... ...
9573 2 0 0 1
9574 5 0 0 1
9575 8 0 0 1
9576 5 0 0 1
9577 6 0 0 1
[9578 rows x 14 columns]>
df.value_counts()
credit.policy purpose int.rate installment log.annual.inc dti fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
0 all_other 0.0712 30.94 10.819778 1.10 772 5910.000000 0 0.8 3 0 0 0 1
1 debt_consolidation 0.1253 284.47 10.668955 9.27 687 2850.000000 3294 18.7 3 1 0 1 1
11.018564 16.11 697 2459.958333 9836 69.8 1 0 0 0 1
11.147397 22.05 692 7020.000000 33321 83.5 1 0 0 0 1
301.20 10.596635 14.46 692 3630.000000 11786 42.2 0 0 0 0 1
..
all_other 0.1284 201.71 10.357743 5.83 742 11069.958330 7699 69.4 1 0 0 1 1
10.645425 15.29 692 5219.958333 5826 31.5 0 1 0 0 1
221.88 11.156251 5.26 742 2850.000000 993 24.8 0 0 0 0 1
235.33 9.574983 3.00 707 1199.958333 3449 19.7 2 0 0 1 1
small_business 0.2121 755.69 11.248960 5.72 672 1440.000000 500 83.3 1 0 0 0 1
Length: 9578, dtype: int64
# Import matplotlib library to plot graph and do distribution of all features in dataset using histogram
from matplotlib import pyplot as plt
for fea1 in df.columns:
plt.hist(df[fea1])
plt.title("Feature:"+fea1)
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
corr_data = df.corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr_data, cmap='BuPu', annot= True)
plt.title("Correlation Matrix")
plt.show()
plt.savefig('Corr.jpeg')
<Figure size 640x480 with 0 Axes>
#To check duplicates in dataset
df.duplicated()
0 False
1 False
2 False
3 False
4 False
...
9573 False
9574 False
9575 False
9576 False
9577 False
Length: 9578, dtype: bool
df.isnull().sum()
credit.policy 0 purpose 0 int.rate 0 installment 0 log.annual.inc 0 dti 0 fico 0 days.with.cr.line 0 revol.bal 0 revol.util 0 inq.last.6mths 0 delinq.2yrs 0 pub.rec 0 not.fully.paid 0 dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 purpose 9578 non-null object 2 int.rate 9578 non-null float64 3 installment 9578 non-null float64 4 log.annual.inc 9578 non-null float64 5 dti 9578 non-null float64 6 fico 9578 non-null int64 7 days.with.cr.line 9578 non-null float64 8 revol.bal 9578 non-null int64 9 revol.util 9578 non-null float64 10 inq.last.6mths 9578 non-null int64 11 delinq.2yrs 9578 non-null int64 12 pub.rec 9578 non-null int64 13 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
df.purpose.value_counts()
debt_consolidation 3957 all_other 2331 credit_card 1262 home_improvement 629 small_business 619 major_purchase 437 educational 343 Name: purpose, dtype: int64
#Check for the outliers of all features in dataset using boxplots
for col1 in df.columns:
fig = px.box(df, x=col1)
fig.show()
#fig = px.box(df, x="purpose")
#fig.show()
#fig = px.box(df, x="int.rate")
#fig.show()
#fig = px.box(df, x="installment")
#fig.show()
#fig = px.box(df, x="log.annual.inc")
#fig.show()
#Now try to convert that outliers to median
import numpy as np
df1= df
def replace_outliers(column):
median = np.median(column)
std_dev = np.std(column)
outliers = (column - median).abs() > (2 * std_dev)
column[outliers] = median
return column
df1
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | debt_consolidation | 0.1189 | 829.10 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 | 0 |
| 1 | 1 | credit_card | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 | 0 |
| 2 | 1 | debt_consolidation | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 | 0 |
| 3 | 1 | debt_consolidation | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 | 0 |
| 4 | 1 | credit_card | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9573 | 0 | all_other | 0.1461 | 344.76 | 12.180755 | 10.39 | 672 | 10474.000000 | 215372 | 82.1 | 2 | 0 | 0 | 1 |
| 9574 | 0 | all_other | 0.1253 | 257.70 | 11.141862 | 0.21 | 722 | 4380.000000 | 184 | 1.1 | 5 | 0 | 0 | 1 |
| 9575 | 0 | debt_consolidation | 0.1071 | 97.81 | 10.596635 | 13.09 | 687 | 3450.041667 | 10036 | 82.9 | 8 | 0 | 0 | 1 |
| 9576 | 0 | home_improvement | 0.1600 | 351.58 | 10.819778 | 19.18 | 692 | 1800.000000 | 0 | 3.2 | 5 | 0 | 0 | 1 |
| 9577 | 0 | debt_consolidation | 0.1392 | 853.43 | 11.264464 | 16.28 | 732 | 4740.000000 | 37879 | 57.0 | 6 | 0 | 0 | 1 |
9578 rows × 14 columns
df1['log.annual.inc'] = replace_outliers(df1['log.annual.inc'])
df1.describe()
| credit.policy | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9.578000e+03 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 |
| mean | 0.804970 | 0.122640 | 319.089413 | 10.930562 | 12.606679 | 710.846314 | 4560.767197 | 1.691396e+04 | 46.799236 | 1.577469 | 0.163708 | 0.062122 | 0.160054 |
| std | 0.396245 | 0.026847 | 207.071301 | 0.489358 | 6.883970 | 37.970537 | 2496.930377 | 3.375619e+04 | 29.014417 | 2.200245 | 0.546215 | 0.262126 | 0.366676 |
| min | 0.000000 | 0.060000 | 15.670000 | 9.700147 | 0.000000 | 612.000000 | 178.958333 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.103900 | 163.770000 | 10.596635 | 7.212500 | 682.000000 | 2820.000000 | 3.187000e+03 | 22.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 1.000000 | 0.122100 | 268.950000 | 10.928884 | 12.665000 | 707.000000 | 4139.958333 | 8.596000e+03 | 46.300000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 0.140700 | 432.762500 | 11.264464 | 17.950000 | 737.000000 | 5730.000000 | 1.824950e+04 | 70.900000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 1.000000 | 0.216400 | 940.140000 | 12.154779 | 29.960000 | 827.000000 | 17639.958330 | 1.207359e+06 | 119.000000 | 33.000000 | 13.000000 | 5.000000 | 1.000000 |
df1['installment'] = replace_outliers(df1['installment'])
df1['days.with.cr.line'] = replace_outliers(df1['days.with.cr.line'])
df1['revol.bal'] = replace_outliers(df1['revol.bal'])
fig = px.box(df1, x="installment")
fig.show()
fig = px.box(df1, x="log.annual.inc")
fig.show()
fig = px.box(df1, x="days.with.cr.line")
fig.show()
fig = px.box(df1, x="revol.bal")
fig.show()
df1.shape
(9578, 14)
df1.describe()
| credit.policy | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 |
| mean | 0.804970 | 0.122640 | 279.863678 | 10.930562 | 12.606679 | 710.846314 | 4180.397064 | 12427.744832 | 46.799236 | 1.577469 | 0.163708 | 0.062122 | 0.160054 |
| std | 0.396245 | 0.026847 | 155.985226 | 0.489358 | 6.883970 | 37.970537 | 1883.718454 | 13513.919075 | 29.014417 | 2.200245 | 0.546215 | 0.262126 | 0.366676 |
| min | 0.000000 | 0.060000 | 15.670000 | 9.700147 | 0.000000 | 612.000000 | 178.958333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.103900 | 163.770000 | 10.596635 | 7.212500 | 682.000000 | 2820.000000 | 3187.000000 | 22.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 1.000000 | 0.122100 | 268.950000 | 10.928884 | 12.665000 | 707.000000 | 4139.958333 | 8596.000000 | 46.300000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 0.140700 | 367.497500 | 11.264464 | 17.950000 | 737.000000 | 5340.041667 | 16525.250000 | 70.900000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 1.000000 | 0.216400 | 682.760000 | 12.154779 | 29.960000 | 827.000000 | 9120.958333 | 75854.000000 | 119.000000 | 33.000000 | 13.000000 | 5.000000 | 1.000000 |
#Get which feature is object
df1.select_dtypes(include=[object])
| purpose | |
|---|---|
| 0 | debt_consolidation |
| 1 | credit_card |
| 2 | debt_consolidation |
| 3 | debt_consolidation |
| 4 | credit_card |
| ... | ... |
| 9573 | all_other |
| 9574 | all_other |
| 9575 | debt_consolidation |
| 9576 | home_improvement |
| 9577 | debt_consolidation |
9578 rows × 1 columns
#Now change above object to numerical (0's or 1's)
from sklearn.preprocessing import LabelEncoder
df1['purpose']=LabelEncoder().fit_transform(df1['purpose'])
df1.head()
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 0.1189 | 268.95 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 | 0 |
| 2 | 1 | 2 | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 | 0 |
| 3 | 1 | 2 | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 | 0 |
| 4 | 1 | 1 | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 | 0 |
#Rescale features to a narrow range using a normlization
from sklearn.preprocessing import MinMaxScaler
re1 = pd.DataFrame(df1)
min_max = MinMaxScaler()
norm = min_max.fit_transform(re1)
norm
array([[1. , 0.33333333, 0.37659847, ..., 0. , 0. ,
0. ],
[1. , 0.16666667, 0.3011509 , ..., 0. , 0. ,
0. ],
[1. , 0.33333333, 0.48401535, ..., 0. , 0. ,
0. ],
...,
[0. , 0.33333333, 0.3011509 , ..., 0. , 0. ,
1. ],
[0. , 0.66666667, 0.63938619, ..., 0. , 0. ,
1. ],
[0. , 0.33333333, 0.50639386, ..., 0. , 0. ,
1. ]])
df1
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 0.1189 | 268.95 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 | 0 |
| 2 | 1 | 2 | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 | 0 |
| 3 | 1 | 2 | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 | 0 |
| 4 | 1 | 1 | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9573 | 0 | 0 | 0.1461 | 344.76 | 10.928884 | 10.39 | 672 | 4139.958333 | 8596 | 82.1 | 2 | 0 | 0 | 1 |
| 9574 | 0 | 0 | 0.1253 | 257.70 | 11.141862 | 0.21 | 722 | 4380.000000 | 184 | 1.1 | 5 | 0 | 0 | 1 |
| 9575 | 0 | 2 | 0.1071 | 97.81 | 10.596635 | 13.09 | 687 | 3450.041667 | 10036 | 82.9 | 8 | 0 | 0 | 1 |
| 9576 | 0 | 4 | 0.1600 | 351.58 | 10.819778 | 19.18 | 692 | 1800.000000 | 0 | 3.2 | 5 | 0 | 0 | 1 |
| 9577 | 0 | 2 | 0.1392 | 268.95 | 11.264464 | 16.28 | 732 | 4740.000000 | 37879 | 57.0 | 6 | 0 | 0 | 1 |
9578 rows × 14 columns
#Create a graph of fico and Credit Policy
sns.set_style('whitegrid')
plt.hist(df1['fico'].loc[df1['credit.policy']==1], bins=15, label='Credit.Policy=1')
plt.hist(df1['fico'].loc[df1['credit.policy']==0], bins=15, label='Credit.Policy=0')
plt.legend()
plt.xlabel('FICO')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
df1[df1['not.fully.paid']==1]['fico'].hist(bins=30, alpha=0.5, color='blue', label='not.fully.paid=1')
df1[df1['not.fully.paid']==0]['fico'].hist(bins=30, alpha=0.5, color='green', label='not.fully.paid=0')
plt.legend()
plt.xlabel('FICO')
Text(0.5, 0, 'FICO')
#Use countplot to see the count of all the purpose of loans by not.fully.paid
sns.countplot(data=df1, x='purpose', hue='not.fully.paid')
<AxesSubplot:xlabel='purpose', ylabel='count'>
#To train model, first seperate input features and target variable
x = df1.drop('not.fully.paid', axis=1)
y = df1['not.fully.paid']
#input features data
x
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 0.1189 | 268.95 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 |
| 2 | 1 | 2 | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 |
| 3 | 1 | 2 | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 |
| 4 | 1 | 1 | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9573 | 0 | 0 | 0.1461 | 344.76 | 10.928884 | 10.39 | 672 | 4139.958333 | 8596 | 82.1 | 2 | 0 | 0 |
| 9574 | 0 | 0 | 0.1253 | 257.70 | 11.141862 | 0.21 | 722 | 4380.000000 | 184 | 1.1 | 5 | 0 | 0 |
| 9575 | 0 | 2 | 0.1071 | 97.81 | 10.596635 | 13.09 | 687 | 3450.041667 | 10036 | 82.9 | 8 | 0 | 0 |
| 9576 | 0 | 4 | 0.1600 | 351.58 | 10.819778 | 19.18 | 692 | 1800.000000 | 0 | 3.2 | 5 | 0 | 0 |
| 9577 | 0 | 2 | 0.1392 | 268.95 | 11.264464 | 16.28 | 732 | 4740.000000 | 37879 | 57.0 | 6 | 0 | 0 |
9578 rows × 13 columns
#target varaiable data
y
0 0
1 0
2 0
3 0
4 0
..
9573 1
9574 1
9575 1
9576 1
9577 1
Name: not.fully.paid, Length: 9578, dtype: int64
#Now seperate the data in 30% for test and 70% for train
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=34)
xtrain
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1953 | 1 | 0 | 0.1442 | 515.74 | 11.141862 | 14.26 | 677 | 4530.041667 | 13235 | 47.8 | 1 | 0 | 0 |
| 975 | 1 | 2 | 0.1039 | 207.69 | 11.373663 | 14.79 | 712 | 5430.000000 | 21532 | 66.9 | 2 | 0 | 0 |
| 1619 | 1 | 0 | 0.1241 | 668.21 | 10.545341 | 1.96 | 787 | 8641.000000 | 507 | 0.3 | 7 | 0 | 0 |
| 6684 | 1 | 0 | 0.0894 | 317.72 | 11.271019 | 0.61 | 782 | 3390.041667 | 951 | 3.9 | 1 | 0 | 0 |
| 5195 | 1 | 0 | 0.1218 | 79.92 | 10.714418 | 20.67 | 697 | 2580.000000 | 563 | 4.5 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5667 | 1 | 0 | 0.0859 | 316.11 | 11.289782 | 6.52 | 802 | 5460.000000 | 2010 | 2.1 | 1 | 0 | 0 |
| 324 | 1 | 6 | 0.1236 | 268.95 | 10.778956 | 0.85 | 682 | 2579.000000 | 639 | 13.6 | 1 | 0 | 0 |
| 3157 | 1 | 0 | 0.1316 | 50.66 | 10.645425 | 17.26 | 672 | 3457.958333 | 2694 | 72.8 | 1 | 0 | 1 |
| 5993 | 1 | 2 | 0.1218 | 479.52 | 10.239960 | 0.00 | 787 | 4860.041667 | 0 | 0.0 | 3 | 0 | 0 |
| 5242 | 1 | 2 | 0.1357 | 594.45 | 10.969921 | 13.45 | 692 | 2519.958333 | 5799 | 76.3 | 0 | 0 | 0 |
6704 rows × 13 columns
ytrain
1953 0
975 0
1619 0
6684 0
5195 0
..
5667 0
324 1
3157 1
5993 0
5242 0
Name: not.fully.paid, Length: 6704, dtype: int64
xtest
| credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1836 | 1 | 2 | 0.1411 | 229.34 | 10.646615 | 23.67 | 667 | 4980.041667 | 10106 | 40.3 | 1 | 0 | 0 |
| 180 | 1 | 1 | 0.1028 | 485.99 | 11.385092 | 11.95 | 702 | 5070.000000 | 24615 | 94.7 | 0 | 0 | 0 |
| 8720 | 0 | 2 | 0.1450 | 258.16 | 11.918391 | 26.27 | 697 | 5070.000000 | 8596 | 95.4 | 2 | 0 | 0 |
| 7229 | 1 | 3 | 0.1062 | 97.68 | 10.184900 | 10.60 | 717 | 3840.041667 | 7154 | 68.8 | 0 | 0 | 0 |
| 2890 | 1 | 2 | 0.1322 | 676.02 | 11.264464 | 0.65 | 727 | 1698.958333 | 5142 | 27.2 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3670 | 1 | 2 | 0.1568 | 87.51 | 10.419301 | 14.29 | 667 | 3809.958333 | 286 | 95.3 | 0 | 2 | 0 |
| 7914 | 0 | 2 | 0.1186 | 23.21 | 10.060491 | 11.33 | 657 | 2879.958333 | 4479 | 101.8 | 1 | 0 | 0 |
| 1013 | 1 | 0 | 0.1292 | 195.21 | 10.819778 | 18.01 | 672 | 2249.958333 | 7423 | 53.4 | 0 | 0 | 0 |
| 8411 | 0 | 2 | 0.1513 | 416.75 | 11.198215 | 15.73 | 652 | 5490.000000 | 22917 | 38.6 | 3 | 1 | 0 |
| 9060 | 0 | 4 | 0.1316 | 149.44 | 11.264464 | 15.83 | 697 | 3359.958333 | 38558 | 64.7 | 4 | 0 | 0 |
2874 rows × 13 columns
ytest
1836 0
180 0
8720 0
7229 0
2890 1
..
3670 0
7914 1
1013 1
8411 0
9060 0
Name: not.fully.paid, Length: 2874, dtype: int64
#Make prediction of the dataset using KNN algorithm
from sklearn.neighbors import KNeighborsClassifier
knn_classi = KNeighborsClassifier(n_neighbors=3)
knn_classi.fit(xtrain, ytrain)
knn_pred = knn_classi.predict(xtest)
knn_pred
array([0, 0, 1, ..., 0, 0, 0], dtype=int64)
#Use SVM algorithm to predict the value
from sklearn import svm
svm_classi = svm.SVC(kernel='linear')
svm_classi.fit(xtrain, ytrain)
svm_pred = svm_classi.predict(xtest)
svm_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
#Use DecisionTree algorithm
from sklearn.tree import DecisionTreeClassifier
d_tree = DecisionTreeClassifier()
d_tree.fit(xtrain, ytrain)
d_tree_pred = d_tree.predict(xtest)
d_tree_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
#Import the GridSearchCV library to find the best hyperparameters for decision tree.
from sklearn.model_selection import GridSearchCV
dtree = DecisionTreeClassifier()
#Define parameter grid to perform better search
param = {
'criterion': ['gini', 'entropy'],
'max_depth': [1,2,3,4,5],
'min_samples_leaf': [1,2,3,4,5]
}
#Create a search grid object
grid_search = GridSearchCV(estimator=dtree, param_grid=param, cv=5)
#Now fit the train data to perform Grid Search
grid_search.fit(xtrain, ytrain)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 3, 4, 5],
'min_samples_leaf': [1, 2, 3, 4, 5]})
#Print the best parameter of the Grid Search
print(grid_search.best_params_)
{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1}
dtree_best = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=2)
dtree_best.fit(xtrain, ytrain)
dtree_best_pred = dtree_best.predict(xtest)
print("Decision Tree Prediction using best parameters:", dtree_best_pred)
Decision Tree Prediction using best parameters: [0 0 0 ... 0 0 0]
#Import metrics library from sklearn which has precision, recall, accuracy, F-measure
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
#Compare the results of all Classifier used in 3b and 4b
#For KNN Classifier
pre_knn = precision_score(ytest, knn_pred)
recall_knn = recall_score(ytest, knn_pred)
accu_knn = accuracy_score(ytest, knn_pred)
f_score_knn = f1_score(ytest, knn_pred)
#For SVM Classifier
pre_svm = precision_score(ytest, svm_pred)
recall_svm = recall_score(ytest, svm_pred)
accu_svm = accuracy_score(ytest, svm_pred)
f_score_svm = f1_score(ytest, svm_pred)
#For Decision Tree Classifier
pre_dtc = precision_score(ytest, d_tree_pred)
recall_dtc = recall_score(ytest, d_tree_pred)
accu_dtc = accuracy_score(ytest, d_tree_pred)
f_score_dtc = f1_score(ytest, d_tree_pred)
#For Decision Tree Classifier using GridSearchCV
pre_dtree_grid = precision_score(ytest, dtree_best_pred)
recall_dtree_grid = recall_score(ytest, dtree_best_pred)
accu_dtree_grid = accuracy_score(ytest, dtree_best_pred)
f_score_grid = f1_score(ytest, dtree_best_pred)
#Print the output of all the classifier
#KNN
print('\n#KNN Result')
print("Precision Score of KNN: {:.2f}".format(pre_knn))
print("Recall Score of KNN: {:.2f}".format(recall_knn))
print("Accuracy of KNN: {:.2f}".format(accu_knn))
print("F1 Score of KNN: {:.2f}".format(f_score_knn))
#SVM
print("\n#SVM Result")
print("Precision Score of SVM: {:.2f}".format(pre_svm))
print("Recall Score of SVM: {:.2f}".format(recall_svm))
print("Accuracy of SVM: {:.2f}".format(accu_svm))
print("F1 Score of SVM: {:.2f}".format(f_score_svm))
#Decision Tree Classifier
print("\n#Decision Tree Classifier")
print("Precision Score of DTC: {:.2f}".format(pre_dtc))
print("Recall Score of DTC: {:.2f}".format(recall_dtc))
print("Accuracy of DTC: {:.2f}".format(accu_dtc))
print("F1 Score of DTC: {:.2f}".format(f_score_dtc))
#Decision Tree Classifier with GridSearchCV
print('\n#Decision Tree Classifier with GridSearchCV')
print("Precision Score of DTG: {:.2f}".format(pre_dtree_grid))
print("Recall Score of DTG: {:.2f}".format(recall_dtree_grid))
print("Accuracy of DTG: {:.2f}".format(accu_dtree_grid))
print("F1 Score of DTG: {:.2f}".format(f_score_grid))
#KNN Result Precision Score of KNN: 0.18 Recall Score of KNN: 0.08 Accuracy of KNN: 0.79 F1 Score of KNN: 0.11 #SVM Result Precision Score of SVM: 0.29 Recall Score of SVM: 0.05 Accuracy of SVM: 0.82 F1 Score of SVM: 0.08 #Decision Tree Classifier Precision Score of DTC: 0.21 Recall Score of DTC: 0.23 Accuracy of DTC: 0.73 F1 Score of DTC: 0.22 #Decision Tree Classifier with GridSearchCV Precision Score of DTG: 0.00 Recall Score of DTG: 0.00 Accuracy of DTG: 0.83 F1 Score of DTG: 0.00
from sklearn.metrics import roc_curve, roc_auc_score
#Calculate fpr, tpr, thresholds and accuracy for KNN
fpr_knn, tpr_knn, thre_knn = roc_curve(ytest, knn_pred)
auc_knn = roc_auc_score(ytest, knn_pred)
#Calculate fpr, tpr, thresholds and accuracy for SVM
fpr_svm, tpr_svm, thre_svm = roc_curve(ytest, svm_pred)
auc_svm = roc_auc_score(ytest, svm_pred)
#Calculate fpr, tpr, thresholds and accuracy for Decision Tree
fpr_dtree, tpr_dtree, thre_dtree = roc_curve(ytest, d_tree_pred)
auc_dtree = roc_auc_score(ytest, d_tree_pred)
#Calculate fpr, tpr, thresholds and accuracy for Decision Tree Best GridSearch
fpr_dtcg, tpr_dtcg, thre_dtcg = roc_curve(ytest, dtree_best_pred)
auc_dtcg = roc_auc_score(ytest, dtree_best_pred)
#Label the axis
plt.title("ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('Ture Positive Rate')
#Plot the ROC Curve for Each Model
plt.plot(fpr_knn, tpr_knn, label='KNN {:.2f}'.format(auc_knn))
plt.plot(fpr_svm, tpr_svm, label='SVM {:.2f}'.format(auc_svm))
plt.plot(fpr_dtree, tpr_dtree, label='DT {:.2f}'.format(auc_dtree))
plt.plot(fpr_dtcg, tpr_dtcg, label='DTGS {:.2f}'.format(auc_dtcg))
plt.legend()
plt.show()